package com.webull.information.center.carwler.common.util.jsoup.prase_cn;

import java.io.IOException;
import java.text.ParseException;
import java.util.Optional;
import java.util.regex.Pattern;

import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;

import com.webull.framework.util.UtilDate;
import com.webull.information.center.carwler.common.model.NewsInformation;
import com.webull.information.center.carwler.common.util.jsoup.HtmlBodyPrase;
import com.webull.information.center.carwler.common.util.jsoup.JsoupPraseUtil;
import com.webull.information.center.common.constants.Constants;

/**
 * HTML body parser for NetEase (163.com) news article pages.
 * <p>
 * The source-name / push-time header is looked up in three page layouts, in
 * this order: the 2016 layout ({@code post_time_source}), the 2013/2014 layout
 * ({@code ep-info cDGray}) and the 2011/2012 layout ({@code span.info}). A
 * later layout is only tried while the source name is still blank.
 *
 * @author shimingjun
 * @date 2016-08-19 09:35:43
 * @version 1.0
 * @since JDK 1.8
 */
public class NetEase163_HtmlPrase implements HtmlBodyPrase {
	protected final Logger logger = LogManager.getLogger(getClass());

	/**
	 * Shape check for a push time: a date followed by any mix of digits, colons
	 * and whitespace. Compiled once instead of on every call.
	 */
	private static final Pattern PUSH_TIME_PATTERN = Pattern.compile("^\\d{4}-\\d{1,2}-\\d{2}(\\d|:|\\s)*$");

	/** Format string handed to {@code UtilDate.parse} for the push time. */
	private static final String NEWS_TIME_FORMAT = "yyyy-MM-dd HH:mm";

	/** Number of leading header-text characters kept as push time (length of "2016-07-28 00:39"). */
	private static final int PUSH_TIME_LENGTH = 16;

	/** Closing anchor tag used to locate the text that follows the source link. */
	private static final String ANCHOR_CLOSE = "</a>";

	/**
	 * Fills {@code info} with language, title, source name, push time / news
	 * time and content extracted from a NetEase article page, for example
	 * http://news.163.com/16/0728/01/BT1ALRRQ00014AED.html
	 * <p>
	 * Nothing is written when neither the {@code epContentLeft} element nor an
	 * {@code endContent} element exists. Any exception raised while parsing is
	 * logged and not propagated.
	 *
	 * @param doc  parsed article page
	 * @param info target object that receives the extracted fields
	 */
	@Override
	public void praseNewsInfo(org.jsoup.nodes.Document doc, NewsInformation info) {
		try {
			Element body = doc.getElementById("epContentLeft");
			if (body == null) {
				body = doc.getElementsByClass("endContent").first();
			}
			if (body == null) {
				return;
			}

			info.setLanguage(Constants.lang_zh);

			// Title: inner HTML of the first <h1>, only when it is not blank.
			Optional.ofNullable(body.getElementsByTag("h1").first())
					.map(h1 -> StringUtils.stripToNull(h1.html()))
					.ifPresent(info::setTitle);

			// Source name and push time, newest layout first.
			parseHeader2016(body, info);
			if (StringUtils.isBlank(info.getSourceName())) {
				parseHeader2013(body, info);
			}
			if (StringUtils.isBlank(info.getSourceName())) {
				parseHeader2011(body, info);
			}

			parseContent(body, info);

			// NOTE: main-picture extraction (first <img> of the content) was
			// disabled in the previous revision and is not performed here.
		} catch (Exception e) {
			// Pass the throwable separately so the stack trace is logged too.
			logger.warn("Failed to parse NetEase news page", e);
		}
	}

	/**
	 * 2016 layout: header element with class {@code post_time_source}. The
	 * source name is the first link's HTML plus everything after that link.
	 */
	private void parseHeader2016(Element body, NewsInformation info) {
		Element header = body.getElementsByClass("post_time_source").first();
		if (header == null) {
			return;
		}
		Element anchor = header.getElementsByTag("A").first();
		if (anchor == null) {
			return;
		}
		String html = header.html();
		String trailing = StringUtils.substring(html, endOfFirstAnchor(html));
		applySourceName(info, anchor.html(), trailing);
		// Header text looks like "2016-07-28 00:39:00" or "2016-07-28 00:39:00 来源:".
		applyPushTime(header, info, false);
	}

	/**
	 * 2013/2014 layout: element whose class attribute is exactly
	 * {@code ep-info cDGray}, then its {@code left} child. The source name is
	 * the first link's HTML plus the markup between the end of the first link
	 * and the start of the last link.
	 */
	private void parseHeader2013(Element body, NewsInformation info) {
		Element infoBar = body.getElementsByAttributeValue("class", "ep-info cDGray").first();
		if (infoBar == null) {
			return;
		}
		Element left = infoBar.getElementsByClass("left").first();
		if (left == null) {
			return;
		}
		Element anchor = left.getElementsByTag("A").first();
		if (anchor == null) {
			return;
		}
		String html = StringUtils.stripToEmpty(left.html());
		int from = endOfFirstAnchor(html);
		int to = StringUtils.lastIndexOfIgnoreCase(html, "<a");
		// With a single link 'to' precedes 'from' and substring yields "".
		applySourceName(info, anchor.html(), StringUtils.substring(html, from, to));
		applyPushTime(left, info, false);
	}

	/**
	 * 2011/2012 layout: {@code span.info} containing an element styled
	 * {@code float:left;}. The source name is the first link's HTML only.
	 */
	private void parseHeader2011(Element body, NewsInformation info) {
		Element infoSpan = body.select("span.info").first();
		if (infoSpan == null) {
			return;
		}
		Element left = infoSpan.getElementsByAttributeValue("style", "float:left;").first();
		if (left == null) {
			return;
		}
		Element anchor = left.getElementsByTag("A").first();
		if (anchor == null) {
			return;
		}
		applySourceName(info, anchor.html(), "");
		applyPushTime(left, info, true);
	}

	/**
	 * Returns the index just past the first {@code </a>} in {@code html}, or
	 * {@code html.length()} when there is none, so callers get an empty tail
	 * instead of a slice starting at an arbitrary offset.
	 */
	private static int endOfFirstAnchor(String html) {
		int close = StringUtils.indexOfIgnoreCase(html, ANCHOR_CLOSE);
		return close < 0 ? html.length() : close + ANCHOR_CLOSE.length();
	}

	/**
	 * Sets the source name to the stripped link HTML followed by the stripped
	 * trailing text; leaves {@code info} untouched when the result is empty.
	 */
	private static void applySourceName(NewsInformation info, String anchorHtml, String trailing) {
		String sourceName = StringUtils
				.stripToNull(StringUtils.stripToEmpty(anchorHtml) + StringUtils.stripToEmpty(trailing));
		if (sourceName != null) {
			info.setSourceName(sourceName);
		}
	}

	/**
	 * Takes the first {@value #PUSH_TIME_LENGTH} characters of the header's own
	 * text as push time and, when they look like a date, stores them as push
	 * time and as parsed news time. The text is stripped before it is
	 * truncated so leading whitespace cannot eat into the timestamp.
	 * <p>
	 * A push time that passes the shape check but cannot be parsed is still
	 * stored as push time; the parse failure is logged instead of being
	 * silently dropped.
	 *
	 * @param header            element whose own text starts with the timestamp
	 * @param info              target object
	 * @param useOffsetOverload {@code true} to call the three-argument
	 *                          {@code UtilDate.parse} overload used by the
	 *                          2011/2012 layout
	 */
	private void applyPushTime(Element header, NewsInformation info, boolean useOffsetOverload) {
		String ownText = StringUtils.stripToEmpty(header.ownText());
		String pushTime = StringUtils.stripToEmpty(StringUtils.substring(ownText, 0, PUSH_TIME_LENGTH));
		if (!PUSH_TIME_PATTERN.matcher(pushTime).matches()) {
			return;
		}
		info.setPushTime(pushTime);
		try {
			if (useOffsetOverload) {
				// NOTE(review): only the 2011/2012 layout passes the extra
				// argument 8 (presumably a UTC+8 offset); confirm whether the
				// other layouts should use it as well.
				info.setNewsTime(UtilDate.parse(pushTime, 8, NEWS_TIME_FORMAT));
			} else {
				info.setNewsTime(UtilDate.parse(pushTime, NEWS_TIME_FORMAT));
			}
		} catch (Exception e) {
			// Best effort: keep the raw push time, but make the failure visible.
			logger.warn("Unable to parse push time '{}' with format {}", pushTime, NEWS_TIME_FORMAT, e);
		}
	}

	/**
	 * Article content: all {@code <p>} elements below {@code #endText}, each
	 * passed through {@code JsoupPraseUtil.trimParagraph}, minus paragraphs
	 * with class {@code otitle} or without text. The content is set to
	 * {@code null} when nothing remains.
	 */
	private void parseContent(Element body, NewsInformation info) {
		Optional.ofNullable(body.getElementById("endText"))
				.map(content -> content.getElementsByTag("p"))
				.ifPresent(paragraphs -> {
					// Walk backwards so index-based removal does not skip entries.
					for (int i = paragraphs.size() - 1; i >= 0; i--) {
						Element paragraph = paragraphs.get(i);
						JsoupPraseUtil.trimParagraph(paragraph);
						if (paragraph.hasClass("otitle") || !paragraph.hasText()) {
							paragraphs.remove(i);
						}
					}
					info.setContent(StringUtils.stripToNull(paragraphs.outerHtml()));
				});
	}

	/**
	 * Manual check: downloads one article page, parses it and prints the
	 * result. The page URL may be given as the first argument; otherwise a
	 * 2016 sample page is used.
	 * <p>
	 * Other sample pages previously tried here:
	 * <ul>
	 * <li>http://money.163.com/12/0615/18/842EJ5LQ002526O5.html</li>
	 * <li>http://money.163.com/10/0524/23/67G2F00H00251LJJ.html</li>
	 * <li>http://news.163.com/16/0726/01/BSS58LPP00014AED.html</li>
	 * <li>http://news.163.com/12/0628/01/8523414400014AED.html?f=jsearch</li>
	 * <li>http://money.163.com/12/0925/18/8C93QS5N00251LJJ.html</li>
	 * <li>http://tech.163.com/14/0222/09/9LM8J6UV000915BF.html</li>
	 * </ul>
	 *
	 * @param args optional; {@code args[0]} overrides the page URL
	 * @throws IOException when the page cannot be fetched
	 */
	public static void main(String[] args) throws ParseException, IOException {
		String url = "http://news.163.com/16/0728/01/BT1ALRRQ00014AED.html";
		if (args != null && args.length > 0 && StringUtils.isNotBlank(args[0])) {
			url = args[0];
		}
		Connection connection = Jsoup.connect(url).userAgent(
				"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36")
				.header("x-client-data", RandomStringUtils.randomAlphanumeric(40));

		org.jsoup.nodes.Document doc = connection.timeout(10000).get();
		NewsInformation info = new NewsInformation();
		new NetEase163_HtmlPrase().praseNewsInfo(doc, info);
		System.out.println(info);
	}
}
